1 Introduction

1.1 Birth Seasonality

It has been well documented that live birth follows a seasonal pattern with more babies born in Spring/Summer in most countries of the northen Hemisphere.

We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]

birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)

# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
j = birth$Month %in% months
birth = birth[which(j),]
rm(j)

# proper date
birth$date = as.Date(paste0(birth$Year, "-",birth$Month,"-01"), format = c("%Y-%B-%d"))
range(birth$date)
## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
birth$Country.or.Area = gsub("United States of America","United States", birth$Country.or.Area)

# getting geo-location
mat = match(birth$Country.or.Area, countriesLow$NAME)

birth$lat = countriesLow$LAT[mat]
birth$lon = countriesLow$LON[mat]
rm(mat)

# sorting countries by latitude

countries.levels = unique(birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]

birth$Country.or.Area = factor(birth$Country.or.Area, levels = countries.levels)

save(birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))

rm(countries.levels, m, lat, o)
# "Australia", "New Zealand","Bermuda", "Greenland", "Guadeloupe",
countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan","Guatemala","American Samoa","New Caledonia","Seychelles","Chile")
# order countries by decreasing latitude
mat = match(countries, countriesLow$NAME)
lat = countriesLow$LAT[mat]
o = order(lat, decreasing = TRUE)
countries = countries[o]
rm(o, lat, mat)

# select countries. Selected countries had long term data and included those from Northern and Southern hemisphere
c = which(birth$Country.or.Area %in% countries)


g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + ggtitle("Absolute number of live birth for selected countries")

g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90))  + 
  ggtitle("Number of live birth for selected countries /!\ y-scale does not include 0")

countries2 = countries[-which(countries %in% c("Guatemala","Switzerland","Finland"))]

cc = which((birth$Country.or.Area %in% countries2) & (birth$Year >= 2000))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 0,hjust = 0),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())+ #expand_limits(y=25)+
  guides(col = FALSE)+ ylab("Number of live births (min to max values)")+
  ggtitle("Number of live birth for selected countries in the last 2 decades")

long_ago = 1978:1981

cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% long_ago))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the late 70's")

cc = which((birth$Country.or.Area %in% countries) & (birth$Year %in% long_ago))

g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the late 70's")

recently = 2011:2013

cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% recently))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in recent years")

sh = which(birth$lat < 0)

g = ggplot(birth[sh,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the SOUTH hemisphere")

rm(sh)

Martinez-Bakker et al., 2014 have shown that the peak time of birth follows a pattern that depends on the latitude of the considered country.

To better understand this relationship and to later investigate the potential origin of birth seasonality, we do a rhythm analysis of the birth patterns to determine the phase (= the peak time) and the relative amplitude of the birth signals, taken year by year.

BR = data.frame()

for(country in unique(birth$Country.or.Area)){
  
  c = which(birth$Country.or.Area == country)
  b = birth[c,]
  years = table(b$Year)
  years = names(years)[years == 12]
  
  for(y in years){
    
    j = which(b$Year == y)
    
    pft = periodic.fisher.test(t = 1:12, x = b$Value[j], p = 12)
    
    line = data.frame(country = country, year = as.numeric(y), pval = pft$pval , phase = pft$phase, rel.ampl = pft$rel.amp)
    
    BR = rbind(BR, line)
    
  }
}

rm(line,c, b, j,y, country, pft, years)
hist(BR$pval, breaks = 50)

# we expect p-value to be small if there is rhythmicity
BR$qval = p.adjust(BR$pval, method = "BH")
# and plot the phase (in month), show the p-value and order by latitude. 

m = match(BR$country, countriesLow$NAME)
BR$lat = countriesLow$LAT[m]
BR$lon = countriesLow$LON[m]
BR$significant_rhythm = BR$qval<=0.1
BR$month = BR$phase
BR$latitude = BR$lat

rm(m)

g = ggplot(BR, aes(x = phase, y=  lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
  ggtitle("Peak month of birth by latitude and years")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).

g = ggplot(BR, aes(x = phase, y=  lat, col = year, alpha = qval))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
  scale_alpha(range = c(0.1,1))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).

  ggtitle("Peak month of birth by latitude and years")
## $title
## [1] "Peak month of birth by latitude and years"
## 
## attr(,"class")
## [1] "labels"
g = ggplot(BR, aes(x = month, y=  latitude, col = year, alpha = significant_rhythm))
g + geom_point(size = 0.75) + #xlim(c(0,12)) +
  geom_hline(yintercept = 0)+
  scale_x_continuous(breaks = 0:12) +
  scale_alpha_discrete(range = c(0.2,1))+
  scale_color_gradient(low = "red4", high = "coral1")+
  ggtitle("Peak month of birth by latitude and years")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: Removed 658 rows containing missing values (geom_point).

#mapw = map_data("world")

#g = ggplot()
#g + geom_polygon(data = mapw, aes(x=long, y = lat, group = group), fill=  NA, col = "gray") +
#  geom_point(data =BR, aes(x = lon + year-2000, y = lat, col = phase, alpha = (qval<=0.1)/2+0.3)) 
# non-transparent means significant. Phase is the month number
#rm(mapw)

We indeed observe this latitude-dependency for the peak time of birth. Interestingly, for some countries in the Nothern hemisphere, we also see a shift towards later later peak time in recent years.

Having a closer look to some countries with measurements for many years:

c = which(BR$country %in% countries)

g = ggplot(BR[c,], aes(x = phase, y = year))
g + geom_point(aes(col = (qval <= 0.1))) + 
  facet_grid( country ~ .) + 
  theme_minimal() +
  theme(strip.text.y = element_text(angle = 0))+
  ggtitle("Shift in birth month since the 70's for many countries")

# note that for some countries, the trend for the phase does change over the years

subset_countries =  c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan")

g = ggplot(BR[BR$country %in% subset_countries, ], aes(x = phase, y = year, col = country))
g + geom_point(aes(alpha = (qval <= 0.1)/2+0.5)) +
    geom_path(data = BR[(BR$country %in% subset_countries) & (BR$qval <= 0.1),], aes(x = phase, y = year, col = country)) +
  xlim(c(0,12))+
  theme_minimal() +
  theme(strip.text.y = element_text(angle = 0))+
  ggtitle("Except for the US and Israel, most countries have seen their peak month of birth being delayed")

??? Maybe should first de-trend the data before computing the phase? But it does not seem to correlate with the overall trends…

We were then wondering if the amplitude of the oscillations were also dependent on latitude.

g = ggplot(BR, aes(x =  rel.ampl , y =  lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + ggtitle("Relative amplitude of birth seasonality by latitude")
## Warning: Removed 658 rows containing missing values (geom_point).

It does not seem to be the case.

1.1.1 Birth seasonality - Summary

  • Phase ( = peak month) of birth seasonality depends on latitude.
  • Amplitude (~ fold change) of birth seasonality depends on age of mothers.

In EU and US (which is the main population of the two apps) in recent years, the phases are quite similar (despite the relatively large variation in latitude), we can thus use Sympto and Kindara’s dataset to explore the origin of seasonality: i.e. is it a change in sexual behavior or a change in fertility?

countries.s = c("Germany", "Switzerland", "France","United States")


g = ggplot(BR[BR$country %in% countries.s,], aes(x = phase, y = year, col = country))
g + geom_point() + xlim(c(0,12)) + geom_hline(yintercept = 2010) +
  ggtitle("Countries for which we have users in Sympto's dataset have similar peak month in the last decade")

The peak birth month for these countries in recent years are July-August. The estimated time of conception is thus ~ November.

For Sympto, we have both location and user’s birth year (age). For Kindara, we currently do not have access to this information. Unfortunately, Sympto’s dataset is a lot smaller than Kindara’s dataset.

1.2 Menstrual-cycle & fertility tracking apps

Clue, Kindara, Sympto

2 Methods

2.1 Data preparation

2.1.1 Clue dataset

2.1.1.1 Data overview

User table:

users = read_feather(path = paste0(IO$input_clue,"users.feather"))
head(users)
## # A tibble: 6 x 7
##   user_id birth_year_bin country_area height_bin weight_bin
##   <chr>   <chr>          <chr>        <chr>      <chr>     
## 1 73ce7b… 1995-1999      United Stat… 160-164    65-69     
## 2 235eb9… 2000-2004      United Stat… 160-164    <NA>      
## 3 34be79… 1995-1999      United Stat… 165-169    60-64     
## 4 d21cf3… 2000-2004      United Stat… <155       50-54     
## 5 0169d0… 1985-1989      United Stat… 160-164    55-59     
## 6 caec40… 1975-1979      United Stat… 165-169    65-69     
## # … with 2 more variables: latest_birth_control <chr>, csv_file <chr>
dim(users)
## [1] 12000     7
cycles_00 = read_feather(path = paste0(IO$input_clue,"cycles/cycles0000.feather"))
head(cycles_00)
## # A tibble: 6 x 11
##   user_id cycle_nb cycle_start cycle_end  cycle_length period_start
##   <chr>      <int> <date>      <date>            <int> <date>      
## 1 77fab8…        3 2017-05-18  2017-06-15           29 2017-05-18  
## 2 40ba99…       11 2018-07-04  2018-07-15           12 2018-07-04  
## 3 bd760e…       12 2018-07-28  2018-08-19           23 2018-07-28  
## 4 0125bd…       15 2018-12-16  2019-01-19           35 2018-12-16  
## 5 a4bae9…        3 2018-07-17  2018-08-14           29 2018-07-17  
## 6 f1c046…       27 2019-05-23  2019-06-18           27 2019-05-23  
## # … with 5 more variables: period_end <date>, period_length <int>,
## #   neg_preg_test <lgl>, pos_preg_test <lgl>, latest_preg_test <chr>
dim(cycles_00)
## [1] 6679   11
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)

tracking = read_feather(path = paste0(tracking_folder,files[1]))
head(tracking)
## # A tibble: 6 x 5
##   user_id                           date       category type         number
##   <chr>                             <date>     <chr>    <chr>        <chr> 
## 1 8dab364e26c969e3f242d1c37158d4b9… 2018-09-11 pain     tender_brea… <NA>  
## 2 64460b1cbc44d72f23a5faed63741e65… 2017-10-19 period   heavy        <NA>  
## 3 7c2bda3a9fd5a18377d397a2748bbb69… 2018-04-29 ANY      <NA>         <NA>  
## 4 5a43343dc6524f888f90b3a408d9800e… 2018-09-15 ANY      <NA>         <NA>  
## 5 65a8fc2d6659af080fe72cc9d9ea2960… 2017-08-18 ANY      <NA>         <NA>  
## 6 2a2c3ee5b3f1567ad56282a907efe657… 2018-09-19 period   light        <NA>
unique(tracking$category)
##  [1] "pain"          "period"        "ANY"           "digestion"    
##  [5] "sleep"         "emotion"       "energy"        "exercise"     
##  [9] "mental"        "pill_hbc"      "sex"           "fluid"        
## [13] "social"        "ailment"       "iud"           "poop"         
## [17] "weight"        "medication"    "bbt"           "injection_hbc"

2.1.1.2 Data preparation

Workflow:

  • Split Country and Area

  • Compute estimated mean BMI

  • Re-assemble the tracking tables into batches, making sure a single user has all its tracking in the same batch

  • Add the users features to the tracking table

  • Label user’s time-series

  • Birth control (as declared by users)

  • Birth control (re-assignment based on users’ declaration and on their tracking history - see Breast Tenderness paper method)

  • User’s tracking activity (convolution by X days from “ANY” tracking to reflect that the user was using the app)

2.1.1.2.1 Countries and Areas
countries_areas = users$country_area %>% str_split_fixed(., " - ", n = 2)
users$country = countries_areas[,1]
users$area = countries_areas[,2]

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_countries.feather"))
## [1] FALSE
2.1.1.2.2 BMI
users$height_bin = factor(users$height_bin, levels = dict$height$bin)
users$weight_bin = factor(users$weight_bin, levels = dict$weight$bin)
heigh_mean = dict$height$mean[match(users$height_bin,dict$height$bin)]
weight_mean = dict$weight$mean[match(users$weight_bin,dict$weight$bin)]
users$est_mean_bmi = weight_mean/((heigh_mean/100)^2)

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_bmi.feather"))
## [1] TRUE
ggplot(users, aes(x = est_mean_bmi, fill = country_area))+
  geom_histogram(binwidth = 2)+
  facet_grid(country_area ~ ., scale = "free_y")
## Warning: Removed 2730 rows containing non-finite values (stat_bin).

2.1.1.2.3 Batches and tracking tables augmentation
max_batch_size = 5000
n_batch = max(par$min_n_batches, ceiling(nrow(users)/max_batch_size))
batch_size = ceiling(nrow(users)/n_batch)

users$batch = rep(1:n_batch, each = batch_size)[1:nrow(users)]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_batches.feather"))
## [1] TRUE
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)

tmp_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
if(!file.exists(tmp_folder)){dir.create(tmp_folder,recursive = TRUE)}else{file.remove(paste0(tmp_folder,list.files(tmp_folder)))}

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

tic()
users_original_file_ids = foreach(file = files, .combine = rbind, .packages = c("feather","stringr")) %dopar%
{
  tracking = read_feather(path = paste0(tracking_folder, file))
  dim(tracking)
  tracking$batch = users$batch[match(tracking$user_id, users$user_id)]
  if(nrow(tracking)>0){
    for(b in unique(tracking$batch)){
      tracking_batch = tracking[which(tracking$batch == b),]
      new_file_name = gsub(".feather",paste0("_batch_",b,".feather"),file)
      write_feather(tracking_batch, path = paste0(tmp_folder, new_file_name ))
    }
    users_original_file_ids = data.frame(user_id = unique(tracking$user_id), original_file_id = str_extract(file,"\\d{4}"))
  }else{  users_original_file_ids = data.frame(user_id = character(), original_file_id = character())}
  return(users_original_file_ids)
}
toc()
## 4.548 sec elapsed
stopImplicitCluster()

users_o_f = aggregate(original_file_id ~ user_id ,users_original_file_ids, function(x)  paste0(sort(x), collapse = ","))
colnames(users_o_f) = c("user_id","original_tracking_files")
write_feather(users_o_f, path = paste0(IO$tmp_clue,"original_tracking_files_per_users.feather"))
input_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
if(file.exists(output_folder)){unlink(output_folder, recursive = TRUE)} ;dir.create(output_folder,recursive = TRUE)
tmp_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
if(file.exists(tmp_folder)){unlink(tmp_folder, recursive = TRUE)} ;dir.create(tmp_folder,recursive = TRUE)

batches = unique(users$batch)
ok = foreach(b = batches) %do% {
  cat(b,"\n")
  all_files = list.files(input_folder)
  files = all_files[grep(paste0("_batch_",b,"\\."),all_files)]
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)
  tracking = foreach(file  = files, .combine = rbind, .packages = "feather") %dopar%{tracking = read_feather(path = paste0(input_folder, file));return(tracking)}
  stopImplicitCluster()
  
  cols_to_add = c("birth_year_bin","country_area","height_bin","weight_bin", "est_mean_bmi")
  m = match(tracking$user_id, users$user_id)
  for(col_to_add in cols_to_add){
    eval(parse(text = paste0("tracking$",col_to_add," = users$",col_to_add,"[m]")))
  }
  o = order(tracking$user_id, tracking$date, tracking$category, tracking$type, tracking$number)
  tracking = tracking[o,]
  
  new_file_name = paste0("tracking_",b,".feather")
  write_feather(tracking, path = paste0(output_folder, new_file_name ))
  file.copy(from = paste0(output_folder, new_file_name ), to = paste0(tmp_folder, new_file_name), overwrite = TRUE)
}
## 1 
## 2 
## 3 
## 4 
## 5
2.1.1.2.4 Augmenting the users table
tracking_folder = paste0(IO$output_clue,"tracking/")
files = list.files(tracking_folder)

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

tic()
users_agg = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr")) %dopar%
{
  tracking = read_feather(path = paste0(tracking_folder, file))
  users_agg = ddply(tracking,
                    .(user_id),
                    summarize,
                    first_obs = min(date),
                    last_obs = max(date),
                    n_obs_day = length(unique(date)),
                    n_obs = sum(category != "ANY"),
                    n_sex = sum(category == "sex"),
                    n_mucus =  sum(category == "fluid"),
                    n_temp = sum(category == "bbt")
  )
  return(users_agg)
}
toc()
## 11.46 sec elapsed
stopImplicitCluster()

write_feather(users_agg, path = paste0(IO$tmp_clue, "users_agg.feather"))

cols_to_add = setdiff(colnames(users_agg),"user_id")
m = match(users$user_id, users_agg$user_id)
for(col_to_add in cols_to_add){eval(parse(text = paste0("users$",col_to_add," = users_agg$",col_to_add,"[m]")))}
users$n_days = as.numeric(users$last_obs - users$first_obs)

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_augmented.feather"))
## [1] TRUE
2.1.1.2.5 Birth control

As declared by the users:

BC = read_feather(path = paste0(IO$input_clue,"birth_control.feather"))
o = order(BC$user_id, BC$date)
BC = BC[o,]
BC$birth_control = replace_na(BC$birth_control, "undefined")

write_feather(BC, path = paste0(IO$output_clue, "birth_control.feather"))
BC = read_feather(path = paste0(IO$output_clue,"birth_control.feather"))

input_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
tmp_folder = paste0(IO$tmp_clue, "tracking_with_user_defined_birth_control/")
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}

files = list.files(input_folder)

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

tic()
catch = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr","tidyverse")) %dopar%
{
  tracking = read_feather(path = paste0(input_folder, file))
  agg = aggregate(date ~ user_id, tracking, min)
  min_date = agg$date[match(tracking$user_id, agg$user_id)]
  tracking = tracking %>%  mutate(rel_date = as.numeric(date - min_date + 1))
  
  u_tracking = tracking %>% select(user_id, date) %>%  unique() %>%  mutate(birth_control = NA,pill_type = NA, pill_regiment = NA)
  u_BC = BC %>% select(-csv_file) %>% filter(user_id %in% unique(tracking$user_id)) %>%  unique() 
  BC_exp = rbind(u_tracking, u_BC) %>%  arrange(., user_id, date)
  
  # need to expand the birth_control variables to replace the NAs with the latest value in the file
  BC_exp = BC_exp %>% group_by(user_id) %>% mutate(
    birth_control = replace_NAs_with_latest_value(birth_control) %>%  replace_na("undefined"),
    pill_type = replace_NAs_with_latest_value(pill_type),
    pill_regiment = replace_NAs_with_latest_value(pill_regiment)
  )
  
  tracking_key = str_c(tracking$user_id, "_",tracking$date)
  BC_key = str_c(BC_exp$user_id,"_",BC_exp$date)
  m = match(tracking_key,BC_key)
  tracking$birth_control_ud = BC_exp$birth_control[m]
  tracking$pill_type_ud = BC_exp$pill_type[m]
  tracking$pill_regiment_ud = BC_exp$pill_regiment[m]
  
  
  write_feather(tracking, path = paste0(output_folder,file))
  file.copy(from = paste0(output_folder,file), to = paste0(tmp_folder,file), overwrite = TRUE)
}
toc()
## 39.986 sec elapsed
stopImplicitCluster()
2.1.1.2.6 Users’ active tracking periods
active_tracking_filter = function(x, N = 28*1.5){
  #res = pmin(1,stats::filter(x, filter = rep(1,N), sides = 1))
  #res = pmin(1,stats::filter(x, filter = rep(1,N), method = "recursive"))
  #res[is.na(res)] = 0
  res = pmin(1,stats::filter(c(rep(0,N-1),x), filter = rep(1,N), sides = 1))
  res = res[-which(is.na(res))]
  return(res)
}
input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)

output_folder = paste0(IO$tmp_clue, "active_tracking/")
if(!dir.exists(output_folder)){dir.create(output_folder)}


time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)

cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
ok = foreach(file = files, .packages = c("dplyr", "feather"), .export = "active_tracking_filter") %dopar% {
  tracking = read_feather(path = paste0(input_folder, file))
  
  any_tracking = tracking %>% filter(. , category == "ANY")  %>%  select(c(user_id, date, birth_control_ud))
  any_tracking$tracking_days = 1
  tmp = data.frame(user_id = rep(unique(tracking$user_id), each = length(time_vec)), date = rep(time_vec, lu(tracking$user_id)))
  active_tracking = dplyr::full_join(tmp,any_tracking, by = c("user_id","date"))
  active_tracking$tracking_days[is.na(active_tracking$tracking_days)] = 0
  active_tracking$birth_control_ud = ave(active_tracking$birth_control_ud, by =  active_tracking$user_id, FUN = replace_NAs_with_latest_value)
  active_tracking = active_tracking %>%  group_by(user_id) %>%  
    mutate(., tracking = active_tracking_filter(tracking_days))
  active_tracking = active_tracking %>% select(user_id, date,birth_control_ud, tracking)
  
  # compressed table
  active_tracking  = active_tracking %>%  group_by(user_id, birth_control_ud) %>% mutate(transition = diff(c(0, tracking)))
  active_tracking  = active_tracking %>%  group_by(user_id) %>% mutate(stretch_num = cumsum(transition == 1))
  active_tracking  = active_tracking %>%  group_by(user_id, birth_control_ud, stretch_num, tracking) %>%  
    mutate(stretch_length = sum(tracking))
  compressed_tracking = active_tracking %>% filter(transition == 1) %>%  select(user_id, date,birth_control_ud, stretch_num, stretch_length) %>%  rename(start_date = date)

  file_name = paste0("active_",file)
  write_feather(compressed_tracking, path = paste0(output_folder,file_name))
}
## Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
## exporting variable(s): active_tracking_filter
toc()
## 22.868 sec elapsed
stopImplicitCluster()

2.2 Defining sexual behavior, fertility and health indicators

2.2.1 Clue dataset

users = read_feather(path = paste0(IO$output_clue, "users.feather"))

2.2.1.1 Population indicators

time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)
input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)

input_active_tracking = paste0(IO$tmp_clue,"active_tracking/")

tracking_pop_agg = foreach(file = files, .combine = rbind, .packages = c("dplyr","tidyverse")) %do% {
  # tracking
  tracking = read_feather(path = paste0(input_folder, file))
  tracking$BC = dict$BC$type[match(tracking$birth_control_ud, dict$BC$birth_control)]
  tracking =  filter(tracking, BC %in% c("F","I"))
  
  # variables aggregates
  tracking_pop_agg = ddply(tracking,
                           .(date,country_area,BC),
                           summarise,
                           n_prot_sex = sum((category == "sex") & (type == "protected_sex"), na.rm = TRUE),
                           n_unprot_sex = sum((category == "sex") & (type == "unprotected_sex"), na.rm = TRUE),
                           n_wd_sex = sum((category == "sex") & (type == "withdrawal_sex"), na.rm = TRUE)
  )
  
  tracking_pop_agg = tracking_pop_agg %>%  mutate(n_sex = n_prot_sex + n_unprot_sex + n_wd_sex)
  
  # active tracking
  active_tracking_compressed = read_feather(path = paste0(input_active_tracking,"active_",file))
  active_tracking = expand_compressed_tracking(active_tracking_compressed)
  active_tracking$BC = dict$BC$type[match(active_tracking$birth_control_ud, dict$BC$birth_control)]
  active_tracking =  filter(active_tracking, BC %in% c("F","I"))
  
  # total number of users
  active_tracking$country_area = tracking$country_area[match(active_tracking$user_id, tracking$user_id)]
  active_tracking_agg = ddply(active_tracking,
                              .(date,country_area,BC),
                              summarise,
                              n_users = sum(tracking, na.rm = TRUE)
  )
  
  
  
  tmp = dplyr::full_join(x = active_tracking_agg , y = tracking_pop_agg, by = c("date","country_area","BC")) %>%  
    arrange(country_area, BC, date) %>% 
    replace_na(list(n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0))
  
  return(tmp)
}

# sum the results from all files
tmp = tracking_pop_agg %>% group_by(date, country_area, BC) %>% 
  summarise_each(.,sum) %>%  arrange(country_area, BC, date)
# expand to have one row per possible date
tmp2 = expand.grid(date = time_vec, country_area = unique(tmp$country_area), BC = unique(tmp$BC))
tmp3 = dplyr::full_join(tmp, tmp2, by = c("date","country_area","BC")) %>%  
  arrange(country_area, BC, date) %>%  
  replace_na(., list(n_users = 0,n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0))
## Warning: Column `country_area` joining character vector and factor,
## coercing into character vector
## Warning: Column `BC` joining character vector and factor, coercing into
## character vector
# final table
tracking_pop_agg = tmp3

# save the results
indicators_folder = paste0(IO$output_clue, "pop_indicators/")
if(!dir.exists(indicators_folder)){dir.create(indicators_folder)}
write_feather(tracking_pop_agg, path = paste0(indicators_folder, "sex_pop_indicators.feather"))
ggplot(tracking_pop_agg, aes(x = date, y = n_users, col = BC))+
  geom_line()+
  facet_wrap(country_area ~ .)

ggplot(tracking_pop_agg, aes(x = date, y = n_sex/n_users, col = BC))+
  geom_line()+
  facet_wrap(country_area ~ .)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_sex/n_users, col = BC))+
  geom_line()+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_prot_sex/n_users, col = BC))+
  geom_line()+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_unprot_sex/n_users, col = BC))+
  geom_line()+
  facet_grid(country_area ~ BC)

tracking_pop_agg = tracking_pop_agg %>% mutate(weekday = wday(date, week_start = 1),
                                               month = month(date),
                                               date_month = year(date)+(month-1)/12)


ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), 
       aes(x = factor(weekday), y =  n_sex/n_users, col = BC))+
  geom_violin(draw_quantiles = 0.5)+
  facet_grid(country_area ~ BC)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), 
       aes(x = factor(month), y =  n_sex/n_users, col = BC))+
  geom_violin(draw_quantiles = 0.5)+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), 
       aes(x = factor(date_month), y =  n_sex/n_users, col = BC))+
  geom_violin(draw_quantiles = 0.5)+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), 
       aes(x = date_month, y =  n_sex/n_users, col = BC, fill = BC))+
  stat_summary(geom="ribbon", 
               fun.ymin = function(x) quantile(x, 0.05), 
               fun.ymax = function(x) quantile(x, 0.95), 
               alpha = 0.3, col = NA) +   
  stat_summary(geom="ribbon", 
               fun.ymin = function(x) quantile(x, 0.25), 
               fun.ymax = function(x) quantile(x, 0.75), 
               alpha = 0.3, col = NA) +  
  stat_summary(geom = "line", fun.y=median) +
  facet_grid(country_area ~ BC, scale = "free_y")

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), 
       aes(x = date_month, y =  n_sex/n_users, col = country_area))+
  stat_summary(geom = "line", fun.y=median) +
  facet_grid(. ~ BC)

2.3 Detecting seasonal patterns in the individual and population indicators

2.4 Birth model

3 Results

3.1 Datasets and user’s demographics

3.1.1 Clue

users = read_feather(path = paste0(IO$output_clue, "users.feather"))
users$country_area = factor(users$country_area, levels = names(sort(table(users$country_area))))
ggplot(users, aes(x = country_area, fill = country_area)) + coord_flip() + geom_bar() + guides(fill = FALSE)

ggplot(users, aes(x = birth_year_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)

ggplot(users, aes(x = height_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)

ggplot(users, aes(x = weight_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)

ggplot(users, aes(x = est_mean_bmi)) +  geom_histogram(binwidth = 1) + facet_grid(country_area ~ .) + guides(fill = FALSE)
## Warning: Removed 2730 rows containing non-finite values (stat_bin).

users$latest_birth_control = factor(users$latest_birth_control, levels = names(sort(table(users$latest_birth_control))))
ggplot(users, aes(x = latest_birth_control, fill = latest_birth_control)) + coord_flip() + geom_bar() + guides(fill = FALSE)